Amiga CD32 Gamer 22

home *** CD-ROM | disk | FTP | other *** search

/ Amiga CD32 Gamer 22 / CD32 Gamer - 1996 - Issue 22.iso / fli / src / c2p_040.s < prev next >

Wrap

Text File | 1995-01-03 | 9KB | 396 lines

; Chunky2Planar algorithm, originally by James McCoull
; Modified by Peter McGavin for variable size and depth
; and "dirty list" (hope I didn't slow it down too much)
;
;       Cpu only solution VERSION 2
;       Optimised for 040+fastram
;       bitplanes are assumed contiguous!
;       analyse instruction offsets to check performance
;
;void __asm c2p_8_040 (register __a0 UBYTE *chunky_data,
;                      register __a1 PLANEPTR raster,
;                      register __a2 UBYTE *dirty_list,
;                      register __d1 ULONG plsiz);
;
; Inputs:
;   a0 -> width*height chunky pixels
;   a1 -> contiguous bitplanes
;   a2 -> dirty list (1-byte flag for whether each 32 pixel "unit" needs updating)
;   d1 =  width*height/8   (width*height must be a multiple of 32)
;
; The symbol "depth" (4, 6 or 8) must be defined before this file is
; assembled; it selects the exported name and how many planes are produced.
;
; Registers:
;   d2-d7/a2-a6 are saved and restored.  d0, d1, a0 and a1 are modified and
;   not restored.
;
; Dirty list:
;   Flags are tested four at a time with tst.l, and after the first dirty
;   unit has been found the groups of four are no longer aligned to the start
;   of the list.  Up to 3 bytes following the last real flag may therefore be
;   read; non-zero values there would be treated as dirty units, so the
;   caller should keep them zero.
;
; Temporary stack frame (44 bytes, offsets relative to sp inside c2p;
; add 4 inside the main_case subroutine because of the return address):
;    0..31  converted plane longwords (plane0..plane7) of the most recently
;           converted unit, not yet written to the bitplanes
;   32      output address belonging to those pending longwords
;   36      main_case: current output address while a1 is borrowed
;   40..43  not referenced
;
; Operation:
;   first_loop/first_patch skip clean units until the first dirty unit,
;   first_case converts it and parks the result in the stack frame.
;   main_case converts each further dirty unit and, in between its merge
;   steps, writes the previously parked unit to the bitplanes.
;   final_case writes the last parked unit.
;
; End-of-data tests use "bhi" (a4 > a0): scanning stops as soon as a0 reaches
; the end of the chunky buffer, so no group of flags that lies entirely past
; the end of the data is ever examined.

                ifeq    depth-8
                xdef    _c2p_8_040
_c2p_8_040:
                else
                ifeq    depth-6
                xdef    _c2p_6_040
_c2p_6_040:
                else
                ifeq    depth-4
                xdef    _c2p_4_040
_c2p_4_040:
                else
                fail    "unsupported depth!"
                endc
                endc
                endc

; merge: exchange bit groups between two data registers.
;   after the macro:
;     \1 = (\1 & ~mask) | ((\2 & ~mask) >> shift)
;     \2 = ((\1 & mask) << shift) | (\2 & mask)      (old \1 on the right)
;   \3 and \4 are scratch and are destroyed.
merge           macro ; in1,in2,tmp3,tmp4,mask,shift
; \1 = abqr
; \2 = ijyz
                move.l  \2,\4
                move.l  #\5,\3
                and.l   \3,\2           ; \2 = 0j0z
                and.l   \1,\3           ; \3 = 0b0r
                eor.l   \3,\1           ; \1 = a0q0
                eor.l   \2,\4           ; \4 = i0y0
                ifeq    \6-1
                add.l   \3,\3           ; shift left by one via add
                else
                lsl.l   #\6,\3          ; \3 = b0r0
                endc
                lsr.l   #\6,\4          ; \4 = 0i0y
                or.l    \3,\2           ; \2 = bjrz
                or.l    \4,\1           ; \1 = aiqy
                endm

; merge4: nibble merge.  For depth > 4 this is a full "merge" with shift 4.
;   For depth <= 4 only \2 is produced (low nibbles of both inputs);
;   \1 is left unchanged and \3 is destroyed.
merge4          macro ; in1,in2,tmp3,tmp4,mask
                ifgt    depth-4
                merge   \1,\2,\3,\4,\5,4
                else
                move.l  #\5,\3          ; \3 = mask
                and.l   \3,\2           ; \2 = 0j0z
                and.l   \1,\3           ; \3 = 0b0r
                lsl.l   #4,\3           ; \3 = b0r0
                or.l    \3,\2           ; \2 = bjrz
                endc
                endm

start:
;               jmp     next            ; self-modified code here
;next:
;               movem.l d1/a0-a2,-(sp)
;; relocate c2p to a 16-aligned address
;               lea     (c2p,pc),a0
;               move.l  a0,d0
;               and.b   #%11110000,d0
;               move.l  d0,a1
;
;; patch jmp
;               move.l  d0,start+2
;               move.w  #(end-c2p)-1,d0
;loop:          move.b  (a0)+,(a1)+
;               dbra    d0,loop
;
;; flush cache
;               move.l  (4).w,a6
;               jsr     (_LVOCacheClearU,a6)
;
;; restore parameters and restart
;               movem.l (sp)+,d1/a0-a2
;               bra.b   start
;
;               ds.w    8               ; space for relocation

; the real c2p routine starts here

c2p:            movem.l d2-d7/a2-a6,-(sp)
                sub.w   #44,sp          ; space for temporary variables

; a0 = chunky buffer
; a1 = output area
; a2 = dirty list
; d1 = plsiz

                movea.l d1,a3           ; a3 = plsiz

                move.l  a0,a4
                lsl.l   #3,d1           ; plsiz*8 = number of chunky pixels
                add.l   d1,a4           ; a4 -> end of chunky data

; ---- skip clean data until the first dirty unit -------------------------

first_loop:
                tst.l   (a2)+           ; do the next 128 pixels need updating?
                bne.b   first_patch     ; branch if yes
                adda.w  #128,a0         ; skip 128 pixels on input
                adda.w  #16,a1          ; skip 128 pixels on output
                cmpa.l  a0,a4
                bhi.b   first_loop      ; continue only while a0 is before the end
                bra.w   exit            ; exit if no changes found

first_patch:
                subq.l  #4,a2           ; restore input address
                tst.b   (a2)+           ; do the next 32 pixels need updating?
                bne.b   first_case      ; branch if yes
                adda.w  #32,a0          ; skip 32 pixels on input
                addq.l  #4,a1           ; skip 32 pixels on output
                tst.b   (a2)+           ; do the next 32 pixels need updating?
                bne.b   first_case      ; branch if yes
                adda.w  #32,a0          ; skip 32 pixels on input
                addq.l  #4,a1           ; skip 32 pixels on output
                tst.b   (a2)+           ; do the next 32 pixels need updating?
                bne.b   first_case      ; branch if yes
                adda.w  #32,a0          ; skip 32 pixels on input
                addq.l  #4,a1           ; skip 32 pixels on output
                tst.b   (a2)+           ; do the next 32 pixels need updating?
                bne.b   first_case      ; branch if yes
                adda.w  #32,a0          ; skip 32 pixels on input
                addq.l  #4,a1           ; skip 32 pixels on output
                bra.b   first_loop      ; this should never happen

; ---- convert the first dirty unit; result is parked in the stack frame --

first_case:
; Each move.l fills a whole register; the later move.w to the same register
; replaces only its low word with pixels from the second half of the unit.
                move.l  (0,a0),d1
                move.l  (4,a0),d3
                move.l  (8,a0),d0
                move.l  (12,a0),d2
                move.l  (2,a0),d4
                move.l  (10,a0),d5
                move.l  (6,a0),d6
                move.l  (14,a0),d7

                move.w  (16,a0),d1
                move.w  (24,a0),d0
                move.w  (20,a0),d3
                move.w  (28,a0),d2
                move.w  (18,a0),d4
                move.w  (26,a0),d5
                move.w  (22,a0),d6
                move.w  (30,a0),d7

                adda.w  #32,a0          ; input now points past this unit

                move.l  d6,a5           ; park d6/d7: they are the merge scratch
                move.l  d7,a6

                merge   d1,d0,d6,d7,$00ff00ff,8
                merge   d3,d2,d6,d7,$00ff00ff,8

                merge4  d1,d3,d6,d7,$0f0f0f0f,4
                merge4  d0,d2,d6,d7,$0f0f0f0f,4

                exg     d1,a5           ; fetch the parked values, park d1/d0
                exg     d0,a6

                merge   d4,d5,d6,d7,$00ff00ff,8
                merge   d1,d0,d6,d7,$00ff00ff,8

                merge4  d4,d1,d6,d7,$0f0f0f0f,4
                merge4  d5,d0,d6,d7,$0f0f0f0f,4

                merge   d3,d1,d6,d7,$33333333,2
                merge   d2,d0,d6,d7,$33333333,2

                merge   d3,d2,d6,d7,$55555555,1
                merge   d1,d0,d6,d7,$55555555,1

                move.l  d0,(0*4,sp)     ;plane0 (movem.l is slower!)
                move.l  d1,(1*4,sp)     ;plane1
                move.l  d2,(2*4,sp)     ;plane2
                move.l  d3,(3*4,sp)     ;plane3

                ifgt    depth-4
                move.l  a5,d3           ; values parked by the exg pair above
                move.l  a6,d2

                merge   d3,d4,d6,d7,$33333333,2
                merge   d2,d5,d6,d7,$33333333,2

                ifgt    depth-6
                merge   d3,d2,d6,d7,$55555555,1
                endc
                merge   d4,d5,d6,d7,$55555555,1

                move.l  d5,(4*4,sp)     ;plane4
                move.l  d4,(5*4,sp)     ;plane5
                ifgt    depth-6
                move.l  d2,(6*4,sp)     ;plane6
                move.l  d3,(7*4,sp)     ;plane7
                endc
                endc

                move.l  a1,(32,sp)      ; save output address
                addq.l  #4,a1           ; skip 32 pixels on output

                cmpa.l  a0,a4
                beq.w   final_case      ; that was the last unit of the buffer

; ---- scan the remaining data --------------------------------------------

main_loop:
                tst.l   (a2)+           ; do the next 128 pixels need updating?
                bne.b   main_patch      ; branch if yes
                adda.w  #128,a0         ; skip 128 pixels on input
                adda.w  #16,a1          ; skip 128 pixels on output
                cmpa.l  a0,a4
                bhi.b   main_loop       ; continue only while a0 is before the end
                bra.w   final_case      ; exit if no changes found

main_patch:
                subq.l  #4,a2           ; restore input address
                tst.b   (a2)+           ; do the next 32 pixels need updating?
                beq.b   1$              ; branch if no
                bsr.b   main_case
1$:
                adda.w  #32,a0          ; skip 32 pixels on input
                addq.l  #4,a1           ; skip 32 pixels on output
                tst.b   (a2)+           ; do the next 32 pixels need updating?
                beq.b   2$              ; branch if no
                bsr.b   main_case
2$:
                adda.w  #32,a0          ; skip 32 pixels on input
                addq.l  #4,a1           ; skip 32 pixels on output
                tst.b   (a2)+           ; do the next 32 pixels need updating?
                beq.b   3$              ; branch if no
                bsr.b   main_case
3$:
                adda.w  #32,a0          ; skip 32 pixels on input
                addq.l  #4,a1           ; skip 32 pixels on output
                tst.b   (a2)+           ; do the next 32 pixels need updating?
                beq.b   4$              ; branch if no
                bsr.b   main_case
4$:
                adda.w  #32,a0          ; skip 32 pixels on input
                addq.l  #4,a1           ; skip 32 pixels on output

                cmpa.l  a0,a4
                bhi.b   main_loop       ; continue only while a0 is before the end
                bra.w   final_case      ; exit if no changes found

; ---- subroutine: convert the unit at a0, write out the parked unit ------
; Called with bsr, so every stack-frame offset carries an extra +4 for the
; return address.  a0 is not advanced here; the caller does that.

main_case:
                move.l  a1,(36+4,sp)    ; save current output address
                move.l  (32+4,sp),a1    ; a1 = previous output address

                move.l  (0,a0),d1
                move.l  (4,a0),d3
                move.l  (8,a0),d0
                move.l  (12,a0),d2
                move.l  (2,a0),d4
                move.l  (10,a0),d5
                move.l  (6,a0),d6
                move.l  (14,a0),d7

                move.w  (16,a0),d1
                move.w  (24,a0),d0
                move.w  (20,a0),d3
                move.w  (28,a0),d2
                move.w  (18,a0),d4
                move.w  (26,a0),d5
                move.w  (22,a0),d6
                move.w  (30,a0),d7

                move.l  d6,a5           ; park d6/d7: they are the merge scratch
                move.l  d7,a6

                move.l  (0*4+4,sp),(a1) ;plane0
                adda.l  a3,a1           ;a1+=plsiz

                merge   d1,d0,d6,d7,$00ff00ff,8
                merge   d3,d2,d6,d7,$00ff00ff,8

                move.l  (1*4+4,sp),(a1) ;plane1
                adda.l  a3,a1           ;a1+=plsiz

                merge   d1,d3,d6,d7,$0f0f0f0f,4
                merge   d0,d2,d6,d7,$0f0f0f0f,4

                exg     d1,a5           ; fetch the parked values, park d1/d0
                exg     d0,a6

                move.l  (2*4+4,sp),(a1) ;plane2
                adda.l  a3,a1           ;a1+=plsiz

                merge   d4,d5,d6,d7,$00ff00ff,8
                merge   d1,d0,d6,d7,$00ff00ff,8

                move.l  (3*4+4,sp),(a1) ;plane3
                adda.l  a3,a1           ;a1+=plsiz

                merge   d4,d1,d6,d7,$0f0f0f0f,4
                merge   d5,d0,d6,d7,$0f0f0f0f,4

                ifgt    depth-4
                move.l  (4*4+4,sp),(a1) ;plane4
                adda.l  a3,a1           ;a1+=plsiz
                endc

                merge   d3,d1,d6,d7,$33333333,2
                merge   d2,d0,d6,d7,$33333333,2

                ifgt    depth-4
                move.l  (5*4+4,sp),(a1) ;plane5
                adda.l  a3,a1           ;a1+=plsiz
                endc

                merge   d3,d2,d6,d7,$55555555,1
                merge   d1,d0,d6,d7,$55555555,1

; planes 0-3 of the previous unit have been written above, so their slots
; can now take the new unit's values
                move.l  d0,(0*4+4,sp)   ;plane0 (movem.l is slower!)
                move.l  d1,(1*4+4,sp)   ;plane1
                move.l  d2,(2*4+4,sp)   ;plane2
                move.l  d3,(3*4+4,sp)   ;plane3

                ifgt    depth-4
                move.l  a5,d3           ; values parked by the exg pair above
                move.l  a6,d2

                ifgt    depth-6
                move.l  (6*4+4,sp),(a1) ;plane6
                adda.l  a3,a1           ;a1+=plsiz
                endc

                merge   d3,d4,d6,d7,$33333333,2
                merge   d2,d5,d6,d7,$33333333,2

                ifgt    depth-6
                move.l  (7*4+4,sp),(a1) ;plane7
                adda.l  a3,a1           ;a1+=plsiz
                endc

                ifgt    depth-6
                merge   d3,d2,d6,d7,$55555555,1
                endc
                merge   d4,d5,d6,d7,$55555555,1

                move.l  d5,(4*4+4,sp)   ;plane4
                move.l  d4,(5*4+4,sp)   ;plane5
                ifgt    depth-6
                move.l  d2,(6*4+4,sp)   ;plane6
                move.l  d3,(7*4+4,sp)   ;plane7
                endc
                endc

                movea.l (36+4,sp),a1    ; restore current output address
                move.l  a1,(32+4,sp)    ; save output address

                rts

; ---- write out the last parked unit --------------------------------------

final_case:
                move.l  (32,sp),a1      ; a1 = previous output address

                move.l  (0*4,sp),(a1)   ;plane0
                adda.l  a3,a1           ;a1+=plsiz
                move.l  (1*4,sp),(a1)   ;plane1
                adda.l  a3,a1           ;a1+=plsiz
                move.l  (2*4,sp),(a1)   ;plane2
                adda.l  a3,a1           ;a1+=plsiz
                move.l  (3*4,sp),(a1)   ;plane3
                ifgt    depth-4
                adda.l  a3,a1           ;a1+=plsiz
                move.l  (4*4,sp),(a1)   ;plane4
                adda.l  a3,a1           ;a1+=plsiz
                move.l  (5*4,sp),(a1)   ;plane5
                ifgt    depth-6
                adda.l  a3,a1           ;a1+=plsiz
                move.l  (6*4,sp),(a1)   ;plane6
                adda.l  a3,a1           ;a1+=plsiz
                move.l  (7*4,sp),(a1)   ;plane7
                endc
                endc

exit:
                add.w   #44,sp          ; release the temporary variables
                movem.l (sp)+,d2-d7/a2-a6
                rts

                cnop    0,4
end:

                end